Slip 29

Q.1. Take the iris flower dataset and reduce the 4D data to 2D using PCA. Then train a 
model and predict the species of a new flower from its given measurements.

# Import libraries
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load iris dataset
iris = datasets.load_iris()
X = iris.data      # Features (4D: sepal length, sepal width, petal length, petal width)
y = iris.target    # Target (species), used below as an index into iris.target_names

# Split the raw 4D data FIRST. Fitting PCA before the split would let the
# held-out test rows influence the projection (data leakage) and make the
# reported accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Reduce 4D → 2D using PCA learned from the training rows only,
# then apply that same projection to the test rows.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Train model (SVM classifier) on the 2D training data
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Predictions on the projected test data
y_pred = model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Predict new flower measurement (example: [5.1, 3.5, 1.4, 0.2])
# The new sample must go through the same fitted PCA before prediction.
new_data = np.array([[5.1, 3.5, 1.4, 0.2]])
new_data_pca = pca.transform(new_data)
prediction = model.predict(new_data_pca)
print("Predicted Flower:", iris.target_names[prediction[0]])

# Visualization (PCA 2D scatter plot)
# Project the full dataset with the already-fitted PCA purely for plotting.
X_pca = pca.transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("Iris Dataset after PCA (4D → 2D)")
plt.show()

Q.2. Use a K-means clustering model to group the employees into various income groups 
(clusters). Preprocess the data if required (i.e. drop missing or null values). Use the elbow 
method and the Silhouette Score to find the value of k.

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# 2. Create Sample Employee Dataset
np.random.seed(42)  # Fixed seed so the synthetic sample is reproducible
data = pd.DataFrame({
    'EmployeeID': range(1, 51),
    'Income': np.random.randint(20000, 120000, size=50),  # Annual income
    'Age': np.random.randint(22, 60, size=50),
    'Experience': np.random.randint(1, 35, size=50)
})

print("Sample Employee Dataset:")
print(data.head())

# 3. Data Preprocessing
feature_columns = ['Income', 'Age', 'Experience']  # Features for clustering

# Drop rows with missing/null feature values, as the task requires.
# The rows are removed from `data` itself (not only from X) and the index is
# reset, so per-row results computed from X later still line up with `data`.
data = data.dropna(subset=feature_columns).reset_index(drop=True)
X = data[feature_columns]

# Scale the features so that Income (tens of thousands) does not dominate
# the distance computation over Age and Experience (tens).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Determine Optimal k
# Fit each candidate k exactly once and reuse the fitted model for both
# diagnostics (inertia for the elbow plot, labels for the silhouette score).
K = range(1, 11)
wcss = []                # Within-cluster sum of squares, one entry per k in K
silhouette_scores = {}   # k -> silhouette score (only computed for k >= 2)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)  # Explicitly set n_init
    labels = kmeans.fit_predict(X_scaled)
    wcss.append(kmeans.inertia_)
    if k >= 2:
        # The silhouette score needs at least two clusters, so k=1 is skipped.
        silhouette_scores[k] = silhouette_score(X_scaled, labels)

# (a) Elbow Method
plt.figure(figsize=(8,5))
plt.plot(K, wcss, 'bo-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.title('Elbow Method')
plt.show()

# (b) Silhouette Score
print("\nSilhouette Scores:")
for k, score in silhouette_scores.items():
    print(f"k={k} -> Silhouette Score = {score:.3f}")

# Choose optimal k (based on Elbow + Silhouette)
# This is a manual choice: inspect the elbow plot and the printed silhouette
# scores, then adjust the value here if they point to a different k.
k_opt = 3

# 5. Apply K-Means Clustering
kmeans = KMeans(n_clusters=k_opt, random_state=42, n_init=10)  # Explicitly set n_init
labels = kmeans.fit_predict(X_scaled)
data['Cluster'] = labels

print("\nDataset with Cluster Labels:")
print(data.head())

# 6. Analyze Clusters
# EmployeeID is an identifier, not a feature, so its average is meaningless
# and is left out of the per-cluster summary.
print("\nCluster-wise Mean Values:")
print(data.drop(columns='EmployeeID').groupby('Cluster').mean(numeric_only=True))

# 7. Visualize clusters in 2D using PCA
# PCA is used here only to project the scaled features onto two axes for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=labels, cmap='viridis', edgecolor='k', s=50)
plt.title('K-Means Clustering of Employees (2D PCA)')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.show()

# 8. Correctly Label Clusters Based on Income
# Rank clusters from lowest to highest mean income; K-Means cluster ids are
# arbitrary, so the ids themselves say nothing about income level.
cluster_income = data.groupby('Cluster')['Income'].mean().sort_values()
n_groups = len(cluster_income)

if n_groups == 3:
    income_labels = ['Low Income', 'Medium Income', 'High Income']
else:
    # With any other number of clusters, zipping against three fixed names
    # would silently truncate and leave some rows with NaN in Income_Group.
    # Fall back to rank-based names so every cluster gets a label.
    income_labels = [
        f'Income Level {rank} of {n_groups}' for rank in range(1, n_groups + 1)
    ]

cluster_labels = dict(zip(cluster_income.index, income_labels))
data['Income_Group'] = data['Cluster'].map(cluster_labels)

print("\nDataset with Income Group Labels:")
print(data.head())
